#---------------------------------------------------------------------------------------------------------# 
#---------------------------------------------------------------------------------------------------------# 
# -------------------------------------        Max Joosten         -------------------------------------- #
#-------------------------------------   Last updated: 30.11.2024    -------------------------------------# 
#---------------------------------------------------------------------------------------------------------# 

# 0. Load relevant packages
library(readxl)
library(tidyverse)
library(purrr)
library(vtable)
library(psych) #for function describe
library(foreign)
library(vdemdata) #VDem database
library(expss)       # For adding labels to variables

# Explanation
# The Refinitiv data of North American companies are stored in a different dataset. That is why we repeat the same data wrangling twice (once for OECD, once for NA).
# Later, we merge the two datasets. 

# 0. Set working directory
setwd("")

# 1. Create list of all dataframes 
mysheetlist <- (c("hr1", "hr2", "hr4", "hr5", "hr6", "hr7", "hr8", "hr9", 
                  "soc1", "soc3", "soc6", "soc7", "soc26", "soc30", "soc71", 
                  "soc91", "soc74", "soc76", "soc43", "soc31", "soc2",
                  "gri"))

# 2. Create for loop to read the excel sheets
mysheets_fromexcel <- list()
i=1
for (i in 1:length(mysheetlist)){
  tempdfna <- read_excel(path="", sheet = mysheetlist[i], skip = 3, col_names = T)
  mysheets_fromexcel[[i]] <- tempdfna
  names(mysheets_fromexcel)[i] <- mysheetlist[i]
}

# Step 2: pivot longer, join in one dataframe, recode values, rename
dfna <- imap(mysheets_fromexcel, ~ .x %>%
                mutate(across(matches("\\d{4}$"), as.character)) %>%
                select(-c("Name")) %>%
                pivot_longer(cols = -c("Type", `ESG Company Name`, `ICB SECTOR NAME`), 
                             names_to = "year", values_to = .y)) %>%
  reduce(left_join) %>%
  mutate(across(all_of(names(mysheets_fromexcel)), 
                ~ case_when(.x == "Y" ~ 1, .x == "N" ~ 0))) %>%
  rename(id = Type,
         company = `ESG Company Name`,
         sector = `ICB SECTOR NAME`)

################################################################################
################################################################################
################################################################################
# Add controls
mysheetlist_controls <- (c("sales", "employees", "roa", "debttoassets", "intsales", "randd"))

# Step 2: Create for loop to read the excel sheets
mysheets_fromexcel <- list()
i=1
for (i in 1:length(mysheetlist_controls)){
  tempdfna <- read_excel(path="C:/Users/MAJOOST/OneDrive - UvA/postdoc_shared/02. data/refinitiv_full_northamerica.xlsx", sheet = mysheetlist_controls[i], skip = 3, col_names = T)
  mysheets_fromexcel[[i]] <- tempdfna
  names(mysheets_fromexcel)[i] <- mysheetlist_controls[i]
}

# Step 2: pivot longer, join in one dataframe, recode values, rename
df_controlsna <- imap(mysheets_fromexcel, ~ .x %>%
                      mutate(across(matches("\\d{4}$"), as.character)) %>%
                      select(-c("Name")) %>%
                      pivot_longer(cols = -c("Type", `ESG Company Name`, `ICB SECTOR NAME`, "ISO CODE -DOMICILE"), 
                                   names_to = "year", values_to = .y)) %>%
  reduce(left_join) %>%
  rename(id = Type,
         company = `ESG Company Name`,
         sector = `ICB SECTOR NAME`,
         cntry = `ISO CODE -DOMICILE`)

# Merge HR, social and control indicators
dfna <- dfna %>% left_join(df_controlsna,
                           by = c("id", "company", "sector", "year"))

# Filter only cases with variables on all
dfna <- dfna %>% 
  filter(!if_all(starts_with(c("hr", "soc")), ~ is.na(.x)))

# There are a few non CA/US companies in the database (1781 observations, including observations from countries that we are not interested in). I dont know why. Going to remove them for now
dfna <- dfna %>% 
  filter(cntry %in% c("CA", "US"))

# Remove other DFs
rm(list=setdiff(ls(), c("dfna", "mysheetlist", "mysheetlist_controls")))


################################################################################
################################################################################
################################################################################
# Step 1: Create for loop to read the excel sheets
mysheets_fromexcel <- list()
i=1
for (i in 1:length(mysheetlist)){
  tempdf <- read_excel(path="C:/Users/MAJOOST/OneDrive - UvA/postdoc_shared/02. data/refinitiv_full.xlsx", sheet = mysheetlist[i], skip = 3, col_names = T)
  mysheets_fromexcel[[i]] <- tempdf
  names(mysheets_fromexcel)[i] <- mysheetlist[i]
}

# Step 2: pivot longer, join in one dataframe, recode values, rename
df <- imap(mysheets_fromexcel, ~ .x %>%
             mutate(across(matches("\\d{4}$"), as.character)) %>%
             select(-c("Name")) %>%
             pivot_longer(cols = -c("Type", `ESG Company Name`, `ICB SECTOR NAME`), 
                          names_to = "year", values_to = .y)) %>%
  reduce(left_join) %>%
  mutate(across(all_of(names(mysheets_fromexcel)), 
                ~ case_when(.x == "Y" ~ 1, .x == "N" ~ 0))) %>%
  rename(id = Type,
         company = `ESG Company Name`,
         sector = `ICB SECTOR NAME`)


# Step 2: Create for loop to read the excel sheets
mysheets_fromexcel <- list()
i=1
for (i in 1:length(mysheetlist_controls)){
  tempdf <- read_excel(path="C:/Users/MAJOOST/OneDrive - UvA/postdoc_shared/02. data/refinitiv_full.xlsx", sheet = mysheetlist_controls[i], skip = 3, col_names = T)
  mysheets_fromexcel[[i]] <- tempdf
  names(mysheets_fromexcel)[i] <- mysheetlist_controls[i]
}

# Step 2: pivot longer, join in one dataframe, recode values, rename
df_controls <- imap(mysheets_fromexcel, ~ .x %>%
                      mutate(across(matches("\\d{4}$"), as.character)) %>%
                      select(-c("Name")) %>%
                      pivot_longer(cols = -c("Type", `ESG Company Name`, `ICB SECTOR NAME`, "ISO CODE -DOMICILE"), 
                                   names_to = "year", values_to = .y)) %>%
  reduce(left_join) %>%
  rename(id = Type,
         company = `ESG Company Name`,
         sector = `ICB SECTOR NAME`,
         cntry = `ISO CODE -DOMICILE`)

# Merge HR, social and control indicators
df <- df %>% left_join(df_controls,
                       by = c("id", "company", "sector", "year"))


# Filter only cases with variables on all
df <- df %>% 
  filter(!if_all(starts_with(c("hr", "soc")), ~ is.na(.x)))

# Remove other DFs
rm(list=setdiff(ls(), c("df", "dfna")))

# Merge Refinitiv ESG Europe with Refinitiv ESG North America
df <- df %>% union(dfna)

# Filter only cases with variables on all
df <- df %>% 
  filter(!if_all(starts_with(c("hr", "soc")), ~ is.na(.x)))

# Replace NA with 0. If a company has a value on one variable, but missing on another, we regard this as a 0.
df <- df %>%
  mutate(soc1 = if_else(is.na(soc1), 0, soc1),
         soc3 = if_else(is.na(soc3), 0, soc3),
         soc6 = if_else(is.na(soc6), 0, soc6),
         soc7 = if_else(is.na(soc7), 0, soc7),
         soc26 = if_else(is.na(soc26), 0, soc26),
         soc30 = if_else(is.na(soc30), 0, soc30),
         soc71 = if_else(is.na(soc71), 0, soc71),
         soc91 = if_else(is.na(soc91), 0, soc91),
         soc74 = if_else(is.na(soc74), 0, soc74),
         soc76 = if_else(is.na(soc76), 0, soc76),
         soc43 = if_else(is.na(soc43), 0, soc43),
         soc31 = if_else(is.na(soc31), 0, soc31),
         soc2 = if_else(is.na(soc2), 0, soc2),
         hr1 = if_else(is.na(hr1), 0, hr1),
         hr2 = if_else(is.na(hr2), 0, hr2),
         hr4 = if_else(is.na(hr4), 0, hr4),
         hr5 = if_else(is.na(hr5), 0, hr5),
         hr6 = if_else(is.na(hr6), 0, hr6),
         hr7 = if_else(is.na(hr7), 0, hr7),
         hr8 = if_else(is.na(hr8), 0, hr8),
         hr9 = if_else(is.na(hr9), 0, hr9))


# Control variables and year as numeric
df <- df %>% mutate(year = as.numeric(year),
                    sales = as.numeric(sales),
                    intsales = as.numeric(intsales),
                    employees = as.numeric(employees),
                    roa = as.numeric(roa),
                    debttoassets = as.numeric(debttoassets),
                    randd = as.numeric(randd))

# Remove other DFs
rm(list=setdiff(ls(), c("df")))

# Rename according to new categories
df <- df %>%
  rename(i_csr_team = soc1, 
         i_csr_stakeholder = soc3, 
         i_csr_external = soc6, 
         i_csr_exec_comp = soc7,
         
         i_norm_un_ilo = hr2,
         i_norm_coc = soc26, 
         i_norm_oecd = soc30,
         
         i_policy_hr  = hr6,
         i_policy_hr_care = hr7,
         i_policy_cl = hr4,
         i_policy_fl = hr5,
         i_policy_supplyselect = hr8,
         i_policy_supplyend = hr9,
         i_policy_sc_hs_train = soc71, 
         i_policy_sc_esg_train = soc91, 
         i_policy_sc_hs = soc74, 
         i_policy_sc_hs_imp = soc76,
         
         i_voluntary_eti = hr1,
         i_voluntary_ft = soc43, 
         i_voluntary_eiti = soc31, 
         i_voluntary_gc = soc2)

# Drop 2021 and 2022 because these are still very incomplete
# Check whether by now (september 23) we can use 2022
df <- df %>% filter(!year %in% c(2008, 2009, 2021, 2022, 2023))

######### Add other multi-stakeholder initiatives
# ASI
asi <- read_xlsx("C:/Users/MAJOOST/OneDrive - UvA/postdoc_shared/02. data/private_standards.xlsx", sheet = "asi") %>%
  select(company, since, end)

df <- df %>% 
  left_join(asi, by = "company") %>% 
  mutate(i_voluntary_asi = case_when(year >= since & (end > year | is.na(end)) ~ 1,
                                     TRUE ~ 0)) %>% 
  select(!c(since, end))

# Bonsucro
bonsucro <- read_xlsx("C:/Users/MAJOOST/OneDrive - UvA/postdoc_shared/02. data/private_standards.xlsx", sheet = "bonsucro") %>%
  select(company, since, end)

df <- df %>% 
  left_join(bonsucro, by = "company") %>% 
  mutate(i_voluntary_bonsucro = case_when(year >= since & (end > year | is.na(end)) ~ 1,
                                          TRUE ~ 0)) %>% 
  select(!c(since, end))

# Better Cotton Initiative (BCI)
bci <- read_xlsx("C:/Users/MAJOOST/OneDrive - UvA/postdoc_shared/02. data/private_standards.xlsx", sheet = "bci") %>%
  select(company, since, end)

df <- df %>% 
  left_join(bci, by = "company") %>% 
  mutate(i_voluntary_bci = case_when(year >= since & (end > year | is.na(end)) ~ 1,
                                     TRUE ~ 0)) %>% 
  select(!c(since, end))

# Fair Labor Association (FLA)
fla <- read_xlsx("C:/Users/MAJOOST/OneDrive - UvA/postdoc_shared/02. data/private_standards.xlsx", sheet = "fla") %>%
  select(company, since, end)

df <- df %>% 
  left_join(fla, by = "company") %>% 
  mutate(i_voluntary_fla = case_when(year >= since & (end > year | is.na(end)) ~ 1,
                                     TRUE ~ 0)) %>% 
  select(!c(since, end))

# Global Coffee Program (GCP)
gcp <- read_xlsx("C:/Users/MAJOOST/OneDrive - UvA/postdoc_shared/02. data/private_standards.xlsx", sheet = "gcp") %>%
  select(company, since, end)

df <- df %>% 
  left_join(gcp, by = "company") %>% 
  mutate(i_voluntary_gcp = case_when(year >= since & (end > year | is.na(end)) ~ 1,
                                     TRUE ~ 0)) %>% 
  select(!c(since, end))

# Roundtable on Sustainable Palm Oil (RSPO)
rspo <- read_xlsx("C:/Users/MAJOOST/OneDrive - UvA/postdoc_shared/02. data/private_standards.xlsx", sheet = "rspo") %>%
  select(company, since, end)

df <- df %>% 
  left_join(rspo, by = "company") %>% 
  mutate(i_voluntary_rspo = case_when(year >= since & (end > year | is.na(end)) ~ 1,
                                      TRUE ~ 0)) %>% 
  select(!c(since, end))

# NOT INCLUDED: BUSINESS DRIVEN STANDARDS
# # ICS
# ics <- read_xlsx("C:/Users/MAJOOST/OneDrive - UvA/postdoc_shared/02. data/private_standards.xlsx", sheet = "ics") %>%
#   select(company, since, end)
# 
# df <- df %>% 
#   left_join(ics, by = "company") %>% 
#   mutate(i_voluntary_ics = case_when(year >= since & (end > year | is.na(end)) ~ 1,
#                                      TRUE ~ 0)) %>% 
#   select(!c(since, end))
# 
# # PSCI
# psci <- read_xlsx("C:/Users/MAJOOST/OneDrive - UvA/postdoc_shared/02. data/private_standards.xlsx", sheet = "psci") %>%
#   select(company, since, end)
# 
# df <- df %>% 
#   left_join(psci, by = "company") %>% 
#   mutate(i_voluntary_psci = case_when(year >= since & (end > year | is.na(end)) ~ 1,
#                                           TRUE ~ 0)) %>% 
#   select(!c(since, end))


# Calculate subset means
# CSR related items
df <- df %>% 
  mutate(csr_mean = select(., starts_with("i_csr")) %>%
           rowMeans( na.rm = TRUE))

# Norm related items
df <- df %>% 
  mutate(norm_mean = select(., starts_with("i_norm")) %>%
           rowMeans( na.rm = TRUE))

# Policy related items
df <- df %>% 
  mutate(policy_mean = select(., starts_with("i_policy")) %>%
           rowMeans( na.rm = TRUE))

# Voluntary related items
df <- df %>% 
  mutate(voluntary_mean = select(., starts_with("i_voluntary")) %>%
           rowMeans( na.rm = TRUE))

# Mean where the different categories are not weighted equally important
df <- df %>% 
  mutate(overall_mean = select(., matches("^i_c|^i_n|^i_p")) %>%
           rowMeans( na.rm = TRUE))

# Mean where the different categories are weighted equally important
df <- df %>% 
  mutate(overall_mean_addup = (csr_mean + norm_mean + policy_mean + voluntary_mean)/4)

# Mean score excluding voluntary
df <- df %>% 
  mutate(overall_mean_addup.novol = (csr_mean + norm_mean + policy_mean)/3)

# Mean score excluding voluntary
df <- df %>% 
  mutate(overall_mean_addup.novolnorm = (csr_mean + policy_mean)/2)

# New business model score
df <- df %>% 
  mutate(bmodel_mean_v2 = (i_policy_supplyselect + i_policy_supplyend + i_csr_exec_comp + i_csr_team) / 4)

# New policy score
df <- df %>% 
  mutate(policy_mean_v2 = (i_policy_hr + i_policy_hr_care + i_policy_cl + i_policy_fl + i_policy_sc_hs_train + i_policy_sc_esg_train + i_policy_sc_hs + i_policy_sc_hs_imp) / 8)

# Mean score excluding voluntary
df <- df %>% 
  mutate(overall_mean_addup_v2 = (bmodel_mean_v2 + policy_mean_v2)/2)

# Drop if score == 9999
df <- df %>%
  mutate(across(c(randd,debttoassets,roa,employees,sales,intsales, cntry, sector),
                ~na_if(., 9999)
  ))


# Log employees, sales and international sales
df <- df %>% mutate(employees_log = log(employees),
                    sales_log = log(sales),
                    intsales_log = log(intsales))

# Recode values "NaN" to actual missings
df <- df %>%
  mutate(across(ends_with(c("_log", "_mean")),
                ~na_if(., "NaN")))

#Recode "-Inf" to 0
df <- df %>% 
  mutate(across(ends_with("_log"), 
                ~ case_when(. == "-Inf" ~ 0,
                            . != "-Inf" ~ .)))

# For OECD norms (OECD countries)
df <- df %>% mutate(oecd = case_when(cntry %in% c("AU", "AT", "BE", "CA", "CL", "CZ",
                                                  "CH", "DE", "DK", "EE",
                                                  "ES", "FI", "FR", "GB", "GR", "HU",
                                                  "IE", "IS", "IL",
                                                  "IT", "JP", "KR", 
                                                  "LU", "MX", "NL", "NO", "PL","SK", "SI",
                                                  "NZ", "PT", "SE", "TR", "US") ~ 1,
                                     cntry %in% "LV" & year > 2015 ~ 1,
                                     cntry %in% "LT" & year > 2017 ~ 1,
                                     cntry %in% "CO" & year > 2019 ~ 1,
                                     cntry %in% "CR" & year > 2020 ~ 1,
                                     TRUE ~ 0))

# For EU regulation (EU countries): HR only became EU member in 2014, but since the EU NFD was only transposed into national legislation, it is included with the rest
# At what date was EU legislation not applicable anymore for the UK?
df <- df %>% mutate(eu = case_when(cntry %in% c("AT", "BE", "BG", "CY", "CZ", 
                                                "DK", "EE", "FI", "FR", "DE", "GR", 
                                                "HU", "IE", "IT", "LV", "LT", "LU", 
                                                "MT", "NL", "PL", "PT", "RO", "SK", 
                                                "SI", "ES", "SE", "HR") & year > 2015 & employees > 499 ~ 1,
                                   cntry %in% "GB" & year > 2015 & year < 2021 & employees > 499 ~ 1,
                                   TRUE ~ 0))

# These Excel sheets only specify whether a company is active in France: after merging, for these companies all years are coded as being under the law.
# Therefore in the second part, we code only those instances in which the observation is >2017 and >9999 (in case this varies under and over 10,000 employees)

# French vigilance law (based in France, after 2017, over 5,000 employees & not based in France, over 10,000 employees)
# Import handcoded list of foreign firms subject to Vigilance law (i.e. doing business in FR with >9,999 employees)
# same for california act (these are very few companies..)
vigilance <- read_xlsx("C:/Users/MAJOOST/OneDrive - UvA/postdoc_shared/02. data/foreign_vigilance_cali_flavie0823.xlsx") %>%
  select(company, sector, cntry, vigil_fr, cali, mod_slav)

df <- df %>% left_join(vigilance,
                       by = c("company", "sector", "cntry"))

# Keep the values for the handcoded and code all french companies with over 4999 employees as subject to the law as well.
df <-  df %>% mutate(vigil_fr = case_when(!cntry %in% "FR" & year > 2017 & employees > 9999 ~ vigil_fr, 
                                          cntry %in% "FR" & year > 2017 & employees > 4999 ~ 1,
                                          TRUE ~ 0))

# Three companies have NA, code these as 0
df$vigil_fr[is.na(df$vigil_fr)] <- 0

# UK Modern Slavery Act (based in the UK / operations in UK)
# The threshold is 35999999; I roughly converted to dollars (currently 1 GBP = 1.27 USD; 35999999 GBP = 45848878 USD) and rounded down
df <- df %>% mutate(mod_slav = case_when(!cntry %in% "GB" & year > 2014 & sales > 45000000 ~ mod_slav,
                                         cntry %in% "GB" & year > 2014 & sales > 45000000 ~ 1,
                                         TRUE ~ 0))

# California Transparency Act threshold is 100000000 sales and starting from 2012
df <- df %>% mutate(cali = case_when(year >= 2012 & sales > 100000000 ~ cali,
                                         TRUE ~ 0))



# NFD regulation in:
# France, Grenelle II Article 53 (2009)
# Denmark, Section 99a of the Danish Financial Statements Act (2012)
# UK, Companies Act (2006). 
# ES (2011), see Jackson et al 2020
# NO (2013), see Jackson et al 2020
df <-  df %>% mutate(nonpre_transp = case_when(cntry %in% "FR" ~ 1,
                                               cntry %in% "DK" & year > 2012 ~ 1,
                                               cntry %in% "UK" ~ 1,
                                               cntry %in% "ES" & year > 2010 ~ 1,
                                               cntry %in% "NO" & year > 2012 ~ 1,
                                               TRUE ~ 0))

# Stringency scores
# Need to write from most specific to least specific, if you start with oecd == 1 ~ 3, then all cases have a score of 3.
# Create a data frame of all combinations
# combinations <- expand.grid(
#   oecd = c(0, 1),
#   vigil_fr = c(0, 1),
#   cali = c(0, 1),
#   eu = c(0, 1),
#   mod_slav = c(0, 1),
#   nonpre_transp = c(0, 1)
# )
# 
# # Convert to conditions for case_when
# conditions <- combinations %>%
#   mutate(condition = paste(
#     "oecd == ", oecd, "&",
#     "vigil_fr == ", vigil_fr, "&",
#     "cali == ", cali, "&",
#     "eu == ", eu, "&",
#     "mod_slav == ", mod_slav, "&",
#     "nonpre_transp == ", nonpre_transp, "~ NA"
#   )) %>%
#   pull(condition)
# 
# # Print conditions
# writeLines(conditions)


df <- df %>%
  mutate(stringency = case_when(
    oecd ==  0 & vigil_fr ==  0 & cali ==  0 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  0 ~ 0,
    oecd ==  1 & vigil_fr ==  0 & cali ==  0 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  0 ~ 3,
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  0 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  0 ~ 8,
    oecd ==  1 & vigil_fr ==  1 & cali ==  0 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  0 ~ 11,
    
    oecd ==  0 & vigil_fr ==  0 & cali ==  1 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  0 ~ 6,
    oecd ==  1 & vigil_fr ==  0 & cali ==  1 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  0 ~ 9,
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  1 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  0 ~ 9,
    oecd ==  1 & vigil_fr ==  1 & cali ==  1 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  0 ~ 12,
    
    oecd ==  0 & vigil_fr ==  0 & cali ==  0 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  0 ~ 4,
    oecd ==  1 & vigil_fr ==  0 & cali ==  0 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  0 ~ 7,
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  0 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  0 ~ 9,
    oecd ==  1 & vigil_fr ==  1 & cali ==  0 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  0 ~ 12,
    
    oecd ==  0 & vigil_fr ==  0 & cali ==  1 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  0 ~ 6,
    oecd ==  1 & vigil_fr ==  0 & cali ==  1 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  0 ~ 9,
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  1 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  0 ~ 9, 
    oecd ==  1 & vigil_fr ==  1 & cali ==  1 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  0 ~ 12,
    
    oecd ==  0 & vigil_fr ==  0 & cali ==  0 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  0 ~ 3, 
    oecd ==  1 & vigil_fr ==  0 & cali ==  0 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  0 ~ 6, 
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  0 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  0 ~ 8, 
    oecd ==  1 & vigil_fr ==  1 & cali ==  0 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  0 ~ 11,
    
    oecd ==  0 & vigil_fr ==  0 & cali ==  1 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  0 ~ 6, 
    oecd ==  1 & vigil_fr ==  0 & cali ==  1 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  0 ~ 9, 
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  1 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  0 ~ 9, 
    oecd ==  1 & vigil_fr ==  1 & cali ==  1 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  0 ~ 12,
    
    oecd ==  0 & vigil_fr ==  0 & cali ==  0 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  0 ~ 4, 
    oecd ==  1 & vigil_fr ==  0 & cali ==  0 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  0 ~ 7, 
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  0 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  0 ~ 9, 
    oecd ==  1 & vigil_fr ==  1 & cali ==  0 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  0 ~ 12,
    
    oecd ==  0 & vigil_fr ==  0 & cali ==  1 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  0 ~ 6, 
    oecd ==  1 & vigil_fr ==  0 & cali ==  1 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  0 ~ 9, 
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  1 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  0 ~ 9, 
    oecd ==  1 & vigil_fr ==  1 & cali ==  1 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  0 ~ 12,
    
    oecd ==  0 & vigil_fr ==  0 & cali ==  0 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  1 ~ 3, 
    oecd ==  1 & vigil_fr ==  0 & cali ==  0 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  1 ~ 6, 
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  0 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  1 ~ 8, 
    oecd ==  1 & vigil_fr ==  1 & cali ==  0 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  1 ~ 11,
  
    oecd ==  0 & vigil_fr ==  0 & cali ==  1 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  1 ~ 6,
    oecd ==  1 & vigil_fr ==  0 & cali ==  1 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  1 ~ 9,
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  1 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  1 ~ 9,
    oecd ==  1 & vigil_fr ==  1 & cali ==  1 & eu ==  0 & mod_slav ==  0 & nonpre_transp ==  1 ~ 12,
    
    oecd ==  0 & vigil_fr ==  0 & cali ==  0 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  1 ~ 4,
    oecd ==  1 & vigil_fr ==  0 & cali ==  0 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  1 ~ 7,
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  0 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  1 ~ 9,
    oecd ==  1 & vigil_fr ==  1 & cali ==  0 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  1 ~ 12,
    
    oecd ==  0 & vigil_fr ==  0 & cali ==  1 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  1 ~ 6,
    oecd ==  1 & vigil_fr ==  0 & cali ==  1 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  1 ~ 9,
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  1 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  1 ~ 9,
    oecd ==  1 & vigil_fr ==  1 & cali ==  1 & eu ==  1 & mod_slav ==  0 & nonpre_transp ==  1 ~ 12,
    
    oecd ==  0 & vigil_fr ==  0 & cali ==  0 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  1 ~ 3,
    oecd ==  1 & vigil_fr ==  0 & cali ==  0 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  1 ~ 6,
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  0 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  1 ~ 8,
    oecd ==  1 & vigil_fr ==  1 & cali ==  0 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  1 ~ 11,
    
    oecd ==  0 & vigil_fr ==  0 & cali ==  1 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  1 ~ 6,
    oecd ==  1 & vigil_fr ==  0 & cali ==  1 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  1 ~ 9,
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  1 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  1 ~ 9,
    oecd ==  1 & vigil_fr ==  1 & cali ==  1 & eu ==  0 & mod_slav ==  1 & nonpre_transp ==  1 ~ 12,
    
    oecd ==  0 & vigil_fr ==  0 & cali ==  0 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  1 ~ 4,
    oecd ==  1 & vigil_fr ==  0 & cali ==  0 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  1 ~ 7,
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  0 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  1 ~ 9,
    oecd ==  1 & vigil_fr ==  1 & cali ==  0 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  1 ~ 12,
    
    oecd ==  0 & vigil_fr ==  0 & cali ==  1 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  1 ~ 8,
    oecd ==  1 & vigil_fr ==  0 & cali ==  1 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  1 ~ 11,
    
    oecd ==  0 & vigil_fr ==  1 & cali ==  1 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  1 ~ 9,
    oecd ==  1 & vigil_fr ==  1 & cali ==  1 & eu ==  1 & mod_slav ==  1 & nonpre_transp ==  1 ~ 12,
    TRUE ~ 0
  ))


df <- df %>% mutate(ensemble = oecd + vigil_fr + cali + mod_slav + eu + oecd + nonpre_transp)

# Allow outcome variables to vary between 0 and 100 for interpretation
df <- df %>%
  mutate(across(c(starts_with("i_") | contains("_mean")), ~.*100))

# There are a few companies without country
# Drop some companies that have no data on "cntry" and are double or not OECD or other motivation
df <- df %>% filter(!company %in% c("AAREAL BANK V", "DEUTSCHE POST", "ENCAVIS", "INTERTRUST",  
                                    "MTU AERO ENGINES HLDG.", "SLM SOLUTIONS GROUP V", "MING YANG SMART ENERGY GROUP GDR REG S", "BEE TEAM"))

# Some companies have missing on country but can be identified
df <- df %>% mutate(cntry = case_when(company %in% "GREENVOLT ENERGIAS RENOVAVEIS S A" ~ "PT",
                                      company %in% "BANK LINTH LLB" ~ "CH",
                                      company %in% "BOBST GROUP" ~ "CH",
                                      company %in% "BPOST SUSP - SUSP.09/12/22" ~ "BE",
                                      company %in% "CONTOURGLOBAL" ~ "GB",
                                      company %in% "JOULES GROUP SUSP - 14/11/2022" ~ "GB",
                                      company %in% "SOURCEBIO INTERNATIONAL" ~ "GB",
                                      company %in% "VIFOR PHARMA" ~ "CH",
                                      company %in% "BIOVENTUS A" ~ "US",
                                      company %in% "COLOSSUS MINERALS (OTC)" ~ "CA",
                                      company %in% "MERCATOR MRLS. (OTC)" ~ "CA",
                                      company %in% "POSEIDON CONCEPTS (OTC)" ~ "US",
                                      company %in% "SAN GOLD" ~ "CA",
                                      company %in% "SEARS CANADA (OTC)" ~ "CA",
                                      company %in% "TELMEX 'L' (OTC)" ~ "MX",
                                      company %in% "TIMMINCO (OTC)" ~ "US",
                                      company %in% "COMMONWEALTH PR. (OTC)" ~ "US",
                                      TRUE ~ cntry))

df <- df %>% mutate(sector = case_when(company %in% "LENTA" ~ "Retailers",
                                       TRUE ~ sector))

# What sectors do we expect effects?
df <- df %>% mutate(sector_effect = case_when(sector %in% c("Household Goods and Home Construction",
                                                            "Telecommunications Equipment",
                                                            "Food Producers",
                                                            "Electronic and Electrical Equipment",
                                                            "Pharmaceuticals and Biotechnology",
                                                            "Alternative Energy",
                                                            "Retailers",
                                                            "Medical Equipment and Services",
                                                            "Construction and Materials",
                                                            "Technology Hardware and Equipment",
                                                            "Personal Care, Drug and Grocery Stores",
                                                            "Industrial Metals and Mining",
                                                            "Chemicals",
                                                            "Personal Goods",
                                                            "Aerospace and Defense",
                                                            "Oil, Gas and Coal",
                                                            "Industrial Materials",
                                                            "Automobiles and Parts",
                                                            "General Industrials",
                                                            "Precious Metals and Mining",
                                                            "Beverages",
                                                            "Gas, Water and Multi-utilities",
                                                            "Tobacco",
                                                            "Open End and Miscellaneous Investment Vehicles",
                                                            "Industrial Transportation",
                                                            "Industrial Engineering",
                                                            "Consumer Services",
                                                            "Software and Computer Services",
                                                            "Travel and Leisure",
                                                            "Electricity",
                                                            "Industrial Support Services",
                                                            "Health Care Providers",
                                                            "Waste and Disposal Service Providers",
                                                            "Media",
                                                            "Leisure Goods") ~ 1,
                                              TRUE ~ 0))

# Lag ROA and Debt one year
df <- 
  df %>%
  group_by(company) %>%
  mutate(roa_lag = dplyr::lag(roa, n = 1, default = NA),
         dta_lag = dplyr::lag(debttoassets, n = 1, default = NA))

# The first year a company appears in the data is always missing because it does not have a lagged value
# Take the value of the actual year as the lagged value
df <-
  df %>%
  mutate(roa_lag = if_else(is.na(roa_lag), roa, roa_lag),
         dta_lag = if_else(is.na(dta_lag), debttoassets, dta_lag))

# There are 15 duplicate company-years, this removes these
df <- df %>%
  distinct(company, year, .keep_all = TRUE)


## Add VDem Civil Society Repression variable
# Load data using the vdem package (variable v2csreprss)
vdemcs <- vdem %>% select(country_name, year, v2csreprss) %>%
  filter(year > 2009)

# Add CY averages
df <- df %>% 
  group_by(cntry, year) %>% 
  mutate(overall_mean_addup_v2_cymean = mean(overall_mean_addup_v2, na.rm = T),
         bmodel_mean_v2_cymean = mean(bmodel_mean_v2, na.rm = T),
         policy_mean_v2_cymean = mean(policy_mean_v2, na.rm = T))

# Recode country names to iso2 codes to merge with our data
load("C:/Users/MAJOOST/OneDrive - UvA/postdoc_shared/02. data/CIA_iso_conversion.RData")

# Syncronize countrynames to merge later on (US and KR have slightly different names)
vdemcs$country_name <- plyr::mapvalues(vdemcs$country_name, from = c("United States of America", "South Korea"),
                                       to = c("United States", "Korea, South"))

vdemcs$cntry <- plyr::mapvalues(vdemcs$country_name, 
                            from = country.codes$country.name, 
                            to = country.codes$iso2)

# Drop country_name
vdemcs <- vdemcs %>% select(!country_name)

# Merge data
df <- df %>%
  left_join(vdemcs, by = c("cntry", "year"))

rm(vdemcs)

# If GRI is missing, then score as 0
df<-df %>% 
  dplyr::mutate(gri = case_when(is.na(gri) ~ 0,
                                TRUE ~ gri))

# Create seperate dataset with only companies that are available across the entire time-period (2010 - 2020, 11 observations)
# In addition, code laggards/middle/top companies in 2010
df_ay <- df %>% 
  group_by(company) %>%
  mutate(n = n()) %>% 
  filter(n == 11) %>% ungroup %>%
  group_by(cntry, year) %>% 
  arrange(cntry, year, desc(overall_mean_addup_v2)) %>% 
  mutate(meanyear = mean(overall_mean_addup_v2),
         
         # 20/60/20
         bot20year = quantile(overall_mean_addup_v2, .2),
         top20year = quantile(overall_mean_addup_v2, .8),
         bot20 = case_when(overall_mean_addup_v2 <= quantile(overall_mean_addup_v2, .2) ~ 1,
                           overall_mean_addup_v2 > quantile(overall_mean_addup_v2, .2) ~ 0),
         top20 = case_when(overall_mean_addup_v2 >= quantile(overall_mean_addup_v2, .8) ~ 1,
                           overall_mean_addup_v2 < quantile(overall_mean_addup_v2, .8) ~ 0),
         mid60 = case_when((overall_mean_addup_v2 > quantile(overall_mean_addup_v2, .2) &  overall_mean_addup_v2 < quantile(overall_mean_addup_v2, .8)) ~ 1,
                           (overall_mean_addup_v2 <= quantile(overall_mean_addup_v2, .2) | overall_mean_addup_v2 >= quantile(overall_mean_addup_v2, .8)) ~ 0),
         # 33/33/33
         bot33year = quantile(overall_mean_addup_v2, .33),
         top33year = quantile(overall_mean_addup_v2, .67),
         bot33 = case_when(overall_mean_addup_v2 <= quantile(overall_mean_addup_v2, .33) ~ 1,
                           overall_mean_addup_v2 > quantile(overall_mean_addup_v2, .33) ~ 0),
         top33 = case_when(overall_mean_addup_v2 >= quantile(overall_mean_addup_v2, .67) ~ 1,
                           overall_mean_addup_v2 < quantile(overall_mean_addup_v2, .67) ~ 0),
         mid33 = case_when((overall_mean_addup_v2 > quantile(overall_mean_addup_v2, .33) &  overall_mean_addup_v2 < quantile(overall_mean_addup_v2, .67)) ~ 1,
                           (overall_mean_addup_v2 <= quantile(overall_mean_addup_v2, .33) | overall_mean_addup_v2 >= quantile(overall_mean_addup_v2, .67)) ~ 0),
         
         
         bot10year = quantile(overall_mean_addup_v2, .10),
         top10year = quantile(overall_mean_addup_v2, .90),
         bot10 = case_when(overall_mean_addup_v2 <= quantile(overall_mean_addup_v2, .10) ~ 1,
                           overall_mean_addup_v2 > quantile(overall_mean_addup_v2, .10) ~ 0),
         top10 = case_when(overall_mean_addup_v2 >= quantile(overall_mean_addup_v2, .90) ~ 1,
                           overall_mean_addup_v2 < quantile(overall_mean_addup_v2, .90) ~ 0),
         mid80 = case_when((overall_mean_addup_v2 > quantile(overall_mean_addup_v2, .10) &  overall_mean_addup_v2 < quantile(overall_mean_addup_v2, .90)) ~ 1,
                           (overall_mean_addup_v2 <= quantile(overall_mean_addup_v2, .10) | overall_mean_addup_v2 >= quantile(overall_mean_addup_v2, .90)) ~ 0)) %>% ungroup() %>%
  group_by(company) %>%
  mutate(bot2010 = bot20[year == 2010],
         top2010 = top20[year == 2010],
         mid2010 = mid60[year == 2010],
         
         bot2010_33 = bot33[year == 2010],
         top2010_33 = top33[year == 2010],
         mid2010_33 = mid33[year == 2010],
         
         bot2010_10 = bot10[year == 2010],
         top2010_10 = top10[year == 2010],
         mid2010_80 = mid80[year == 2010]) %>% ungroup()


# Create interaction term
df_ay <- df_ay %>%
  mutate(botmidtop_int = case_when(bot2010 == 1 ~ 1,
                                   mid2010 == 1 ~ 2,
                                   top2010 == 1 ~ 3))

# Recode to dummy for individual analysis, create supply chain measure
df$i_csr_stakeholder.d <- df$i_csr_stakeholder/100
df$i_csr_exec_comp.d <- df$i_csr_exec_comp/100
df$i_policy_fl.d <- df$i_policy_fl/100
df <- df %>% mutate(sc_mean = (i_policy_supplyselect + i_policy_supplyend + i_policy_sc_hs_train + i_policy_sc_hs_imp + i_policy_sc_hs + i_policy_sc_esg_train)/6)

# Specify the aspects of legislation
df <- df %>%
  mutate(normrisk = case_when(oecd == 1 ~ 1, TRUE ~ 0),
         normriskapp = case_when(oecd == 1 ~ 1, TRUE ~ 0),
         normscresp = case_when(oecd == 1 ~ 1, TRUE ~ 0),
         
         lawencrisk = case_when(cali == 1 | mod_slav == 1 | eu == 1 | vigil_fr == 1 ~ 1, TRUE ~ 0),
         lawencriskapp = case_when(cali == 1 | mod_slav == 1 | eu == 1 | vigil_fr == 1 ~ 1, TRUE ~ 0),
         lawencscresp = case_when(cali == 1 | mod_slav == 1 | eu == 1 | vigil_fr == 1 ~ 1, TRUE ~ 0),
         
         lawoblrisk = case_when(cali == 1 | vigil_fr == 1 ~ 1, TRUE ~ 0),
         lawoblriskapp = case_when(cali == 1 | vigil_fr == 1 ~ 1, TRUE ~ 0),
         lawoblriskprev = case_when(vigil_fr == 1 ~ 1, TRUE ~ 0),
         lawoblriskcorract = case_when(vigil_fr == 1 ~ 1, TRUE ~ 0),
         
         pubenf = case_when(cali == 1 | eu == 1 ~ 1, TRUE ~ 0),
         liability = case_when(vigil_fr == 1 ~ 1, TRUE ~ 0))


 # Save data
setwd("C:/Users/MAJOOST/OneDrive - UvA/postdoc_shared/02. data")
save(df,
     file = "ref_v1.RData")

df$year <- as.factor(df$year)
df$sector <- as.factor(df$sector)
df$cntry <- as.factor(df$cntry)

# Label variables for Stata output (only those appearing in the models)
df <- apply_labels(df,
                   cntry = "Country",
                   sector = "Sector",
                   year = "Year",
                   stringency = "Stringency",
                   overall_mean_addup = "Overall",
                   csr_mean = "CSR",
                   norm_mean = "Norm",
                   policy_mean = "Policy",
                   voluntary_mean = "Voluntary",
                   employees_log = "Employees (log)",
                   roa_lag = "Return on assets (lagged 1 year)",
                   dta_lag = "Debt to assets (lagged 1 year)",
                   ensemble = "Number of legislations")

write.dta(df, 
          file = "ref_v1.dta")
save(df_ay,
     file = "ref_complete_v2.RData")

# Save as Stata file (recode as factors)
df_ay$year <- as.factor(df_ay$year)
df_ay$sector <- as.factor(df_ay$sector)
df_ay$cntry <- as.factor(df_ay$cntry)
df_ay$i_csr_stakeholder <- df_ay$i_csr_stakeholder/100
df_ay$i_csr_exec_comp <- df_ay$i_csr_exec_comp/100
df_ay$i_policy_fl <- df_ay$i_policy_fl/100
df_ay <- df_ay %>% mutate(sc_mean = (i_policy_supplyselect + i_policy_supplyend + i_policy_sc_hs_train + i_policy_sc_hs_imp + i_policy_sc_hs + i_policy_sc_esg_train)/6)

# Label variables for Stata output (only those appearing in the models)
df_ay <- apply_labels(df_ay,
                   cntry = "Country",
                   sector = "Sector",
                   year = "Year",
                   stringency = "Stringency",
                   overall_mean_addup = "Overall",
                   csr_mean = "CSR",
                   norm_mean = "Norm",
                   policy_mean = "Policy",
                   voluntary_mean = "Voluntary",
                   employees_log = "Employees (log)",
                   roa_lag = "Return on assets (lagged 1 year)",
                   dta_lag = "Debt to assets (lagged 1 year)",
                   ensemble = "Number of legislations")

write.dta(df_ay,
     file = "ref_complete_v2.dta")

rm(list=ls())
